# Read the crash records, convert the column names with janitor::clean_names(),
# and keep only rows usable for the analysis.
airplane_df <- read_csv("datasets/airplane_crashes.csv") |>
  janitor::clean_names() |>
  # discard rows where `ground` or `aboard` holds the literal string "NULL"
  filter(ground != "NULL" & aboard != "NULL") |>
  # drop the flight number and the passenger/crew breakdown columns
  select(
    -c(
      flight_number,
      fatalities_passangers,
      fatalities_crew,
      aboard_passangers,
      aboard_crew
    )
  ) |>
  # require a non-missing value in each of the listed columns
  drop_na(
    date, time, operator, route, aboard,
    fatalities, registration, cn_ln, ground, summary
  )
## Rows: 4967 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Date, Time, Location, Operator, Flight #, Route, AC Type, Registra...
## dbl (1): Fatalities
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Creating a datetime column.
# Build a single `datetime` column from the separate `date` and `time` strings.
airplane_df <- airplane_df |>
  mutate(
    # strip surrounding whitespace from the time strings
    time = str_trim(time),
    # blank out the two known-invalid time values so they cannot be parsed
    time = replace(time, time %in% c("91:5", "90:0"), NA),
    # parse "<date> <time>" as month-day-year hour:minute
    datetime = mdy_hm(paste(date, time))
  ) |>
  # keep only the rows whose datetime parsed successfully
  filter(!is.na(datetime))
Converting variables to their proper variable types.
# Derive calendar fields from `datetime` and convert the count columns and the
# operator column to the types used by the plots below.
airplane_df <- airplane_df |>
  mutate(
    year = year(datetime),
    month = month(datetime),
    # labelled month (e.g. Jan, Feb) for readable axes
    month_name = month(datetime, label = TRUE),
    aboard = as.numeric(aboard),
    # `fatalities` was already read as numeric; converting it from itself keeps
    # it intact (previously it was overwritten with the `ground` values)
    fatalities = as.numeric(fatalities),
    # `ground` was read as text, so it needs its own numeric conversion
    ground = as.numeric(ground),
    # factor so crashes can be grouped by operator
    operator = as.factor(operator)
  ) |>
  # the raw date/time strings are superseded by `datetime`
  select(-date, -time)
Creating a decade column, now that year is numeric.
# Add a decade label such as "1950s" and move the time-related columns to the
# front of the data frame.
airplane_df <- airplane_df |>
  mutate(
    # integer-divide by 10 to drop the last digit of the year, then scale back
    decade = paste0((year %/% 10) * 10, "s")
  ) |>
  relocate(datetime, year, decade, month, month_name)
Crashes per year?
# Static version: yearly crash counts drawn as a line with points, plus a
# red loess trend line (span 0.2, no confidence band).
airplane_df |>
  count(year, name = "total_crashes") |>
  ggplot(aes(year, total_crashes)) +
  geom_line() +
  geom_point() +
  geom_smooth(se = FALSE, color = "red", span = 0.2) +
  labs(
    x = "Year",
    y = "Number of Crashes",
    title = "Airplane Crashes per Year"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Interactive version of the yearly crash count, rendered with plotly.
airplane_df |>
  group_by(year) |>
  summarise(n = n()) |>
  plot_ly(
    type = "scatter",
    mode = "lines+markers",
    x = ~year,
    y = ~n,
    # custom hover text; the empty <extra> tag is part of the template
    hovertemplate = "Year: %{x}<br>Crashes: %{y}<extra></extra>"
  ) |>
  layout(
    title = list(text = "Airplane Crashes per Year", font = list(size = 20)),
    xaxis = list(title = list(text = "Year", font = list(size = 16))),
    yaxis = list(
      title = list(text = "Number of Crashes", font = list(size = 16))
    )
  )
Seasonal trends, combining all years?
# Bar chart of crash counts per calendar month, pooled over every year.
airplane_df |>
  count(month_name, name = "total_crashes") |>
  ggplot(aes(month_name, total_crashes)) +
  geom_col(fill = "blue") +
  ggtitle("Airplane Crashes by Month")
Top airlines with crashes?
# Horizontal bar chart of the 15 operators with the highest crash counts.
airplane_df |>
  count(operator, name = "total_crashes") |>
  # keep the 15 largest counts
  slice_max(total_crashes, n = 15) |>
  # order the bars by crash count instead of by factor level
  ggplot(aes(reorder(operator, total_crashes), total_crashes)) +
  geom_col(fill = "blue") +
  # flip so the operator names sit on the vertical axis
  coord_flip() +
  labs(
    x = "Airline Operators",
    y = "Number of Crashes",
    title = "Top 15 Airline Operators with the Most Crashes"
  ) +
  # extra right margin on the y-axis title
  theme(axis.title.y = element_text(margin = margin(r = 20)))
Avg fatalities per year?
# Line chart of the mean of `fatalities` across the crashes in each year;
# missing values are ignored when averaging.
airplane_df |>
  group_by(year) |>
  summarise(avg_fatalities = mean(fatalities, na.rm = TRUE)) |>
  ggplot(aes(year, avg_fatalities)) +
  geom_line() +
  geom_point() +
  ggtitle("Average Fatalities per Crash per Year")
# Heatmap of crash counts by decade (x) and calendar month (y).
airplane_df |>
  # one row per decade/month combination, with the crash count in `n`
  count(decade, month_name) |>
  ggplot(aes(x = decade, y = month_name, fill = n)) +
  # white borders separate neighbouring tiles
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(
    # the data are aggregated by decade, so the title and axis label say so
    title = "Heatmap of Airplane Crashes by Decade and Month",
    x = "Decade",
    y = "Month",
    fill = "Number of Crashes"
  )